{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TensorFlow CNN with my own pre-trained word vectors (CBOW)\n",
    "- The CBOW embeddings were generated with TensorFlow\n",
    "- The model itself is implemented in TensorFlow\n",
    "- Modeled from: https://github.com/huseinzol05/Text-Classification-Comparison/blob/master/Deep-learning/cnn-vector.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Enable IPython's autoreload extension in mode 2: imported modules are reloaded before each cell runs,\n",
    "# so edits to the local helpers.* modules imported below take effect without restarting the kernel.\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ellfae/anaconda3/envs/tensorflow/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import preprocessing, metrics, decomposition, pipeline, dummy\n",
    "from nltk.tokenize import TweetTokenizer\n",
    "import os\n",
    "\n",
    "import tensorflow as tf\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "\n",
    "\n",
    "import helpers.preprocessing as prep\n",
    "import helpers.regprep as regprep\n",
    "# to perform evaluations (new one - mines)\n",
    "import helpers.evaluate as ev\n",
    "evaluator = ev.Evaluate()\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import helpers.pickle_helpers as ph\n",
    "import gensim\n",
    "from gensim.models import KeyedVectors\n",
    "import time\n",
    "import math\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import re\n",
    "import helpers.husein as H"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the merged emotion dataset through the project's pickle helper.\n",
    "# NOTE(review): later cells use the result as a DataFrame with `text` and `emotions` columns - confirm against the helper.\n",
    "# NOTE(review): if the helper unpickles with pickle.load, only point it at trusted files.\n",
    "train_data = ph.load_from_pickle(directory=\"data/husein_emotion/emotion-english/merged_training.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run every raw text through the project's H.clearstring helper and store the result back in the `text` column.\n",
    "train_data[\"text\"] = train_data[\"text\"].apply(H.clearstring)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the cleaned data into train / test / validation frames with the project helper.\n",
    "# NOTE(review): the meaning of the second argument (20) is not visible here - presumably a hold-out percentage; confirm in helpers.preprocessing.\n",
    "train_data, test_data, val_data = prep.split_original(train_data, 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Sampling\n",
    "# Draw a seeded random sample from each split.\n",
    "SAMPLE_SIZE = 10000  # maximum number of rows kept per split\n",
    "SAMPLE_SEED = 10     # fixed random_state so the same rows are drawn on every run\n",
    "\n",
    "# min(...) caps the request at the split's length: DataFrame.sample raises ValueError when n exceeds\n",
    "# the number of rows (sampling is without replacement).\n",
    "train_data = train_data.sample(n=min(SAMPLE_SIZE, len(train_data)), random_state=SAMPLE_SEED).copy()\n",
    "test_data = test_data.sample(n=min(SAMPLE_SIZE, len(test_data)), random_state=SAMPLE_SEED).copy()\n",
    "val_data = val_data.sample(n=min(SAMPLE_SIZE, len(val_data)), random_state=SAMPLE_SEED).copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pre-trained word embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "### load word embeddings and accompanying vocabulary\n",
    "# Both objects are used together as wv[vocab[token]] (see generate_embeds_with_pads), i.e. `vocab` maps a\n",
    "# token to the key that indexes its vector in `wv`; membership in `vocab` also decides which tokens are kept.\n",
    "# NOTE(review): presumably vocab is a dict token -> row id and wv a 2-D array - confirm against the pickles.\n",
    "wv = ph.load_from_pickle(\"data/husein_emotion/tf_embeddings/tf_cbow_embeddings.p\")\n",
    "vocab = ph.load_from_pickle(\"data/husein_emotion/tf_embeddings/tf_cbow_dictionary.p\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenization and Label Binarization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_unknown_words(tokens):\n",
    "    \"\"\"Return, in their original order, the tokens that are contained in the global `vocab`.\"\"\"\n",
    "    known = []\n",
    "    for token in tokens:\n",
    "        if token in vocab:\n",
    "            known.append(token)\n",
    "    return known\n",
    "\n",
    "def check_size(c, size):\n",
    "    \"\"\"Return True when `c` has strictly more than `size` elements, otherwise False.\"\"\"\n",
    "    return len(c) > size\n",
    "    \n",
    "### tokens and tokensize\n",
    "MIN_TOKENS = 7  # a row is kept only when it has strictly more than this many in-vocabulary tokens\n",
    "\n",
    "def tokenize_and_filter(frame, min_tokens=MIN_TOKENS):\n",
    "    \"\"\"Tokenise `frame.text`, drop out-of-vocabulary tokens and remove rows that are too short.\n",
    "\n",
    "    Returns a new frame (the input is not modified) with two extra columns,\n",
    "    `tokens` (whitespace-split text filtered by remove_unknown_words) and\n",
    "    `tokensize` (its length), containing only the rows for which\n",
    "    check_size(tokens, min_tokens) is true, re-indexed 0..n-1.\n",
    "    \"\"\"\n",
    "    frame = frame.copy()\n",
    "    frame[\"tokens\"] = frame.text.apply(lambda t: remove_unknown_words(t.split()))\n",
    "    frame[\"tokensize\"] = frame.tokens.apply(len)\n",
    "    # astype(bool) keeps the mask boolean even when the frame is empty\n",
    "    long_enough = frame.tokens.apply(lambda d: check_size(d, min_tokens)).astype(bool)\n",
    "    # the fresh 0..n-1 index is what generate_batches bins on\n",
    "    return frame.loc[long_enough].reset_index(drop=True)\n",
    "\n",
    "### filter by tokensize and reset the index\n",
    "### (sorting by tokensize is not needed for the CNN because bucketing is not needed)\n",
    "train_data = tokenize_and_filter(train_data)\n",
    "test_data = tokenize_and_filter(test_data)\n",
    "val_data = tokenize_and_filter(val_data)\n",
    "\n",
    "### Binarization\n",
    "emotions = list(train_data.emotions.unique())\n",
    "num_emotions = len(emotions)\n",
    "known_emotions = set(emotions)\n",
    "\n",
    "def label_sets(frame):\n",
    "    \"\"\"Return one label set per row, restricted to the emotions seen in the training data.\"\"\"\n",
    "    return [set(emos) & known_emotions for emos in frame[['emotions']].values]\n",
    "\n",
    "# binarizer\n",
    "mlb = preprocessing.MultiLabelBinarizer()\n",
    "\n",
    "train_data_labels = label_sets(train_data)\n",
    "test_data_labels = label_sets(test_data)\n",
    "val_data_labels = label_sets(val_data)\n",
    "\n",
    "# Fit on the training labels only. Calling fit_transform again on test/val would rebuild the class list\n",
    "# from that split alone, so the number and meaning of the columns could differ from training.\n",
    "y_bin_emotions = mlb.fit_transform(train_data_labels)\n",
    "test_y_bin_emotions = mlb.transform(test_data_labels)\n",
    "val_y_bin_emotions = mlb.transform(val_data_labels)\n",
    "\n",
    "train_data['bin_emotions'] = y_bin_emotions.tolist()\n",
    "test_data['bin_emotions'] = test_y_bin_emotions.tolist()\n",
    "val_data['bin_emotions'] = val_y_bin_emotions.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Batching by Bucketing approach"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "### TODO: put this into data helpers\n",
    "### builds the padded embedding rows of one sentence; zero vectors fill the missing positions\n",
    "def generate_embeds_with_pads(tokens, max_size):\n",
    "    \"\"\"Return a list of exactly `max_size` embedding vectors (each a list) for `tokens`.\n",
    "\n",
    "    Tokens beyond position `max_size` are ignored; positions without a token\n",
    "    are filled with zero vectors of length EMBEDDING_DIM. Relies on the\n",
    "    notebook globals `wv`, `vocab` and `EMBEDDING_DIM`.\n",
    "    \"\"\"\n",
    "    n_real = min(len(tokens), max_size)\n",
    "    # embeddings of the tokens that fit into the window\n",
    "    rows = [list(wv[vocab[tokens[pos]]]) for pos in range(n_real)]\n",
    "    # zero padding for the remaining positions\n",
    "    rows.extend(list(np.zeros(EMBEDDING_DIM)) for _ in range(max_size - n_real))\n",
    "    return rows\n",
    "\n",
    "### generate the actual batches\n",
    "def generate_batches(data, batch_size):\n",
    "    \"\"\"Partition the rows of `data` into contiguous batches of roughly equal size.\n",
    "\n",
    "    The rows are binned on `data.index`, so the frame is expected to carry a\n",
    "    0..len(data)-1 integer index (the callers reset the index beforehand).\n",
    "    ceil(len(data) / batch_size) bins are created; their sizes are as equal as\n",
    "    possible and therefore not necessarily exactly `batch_size`.\n",
    "\n",
    "    Returns:\n",
    "        (indices, maxes): `indices` maps the 1-based bin number to the array of\n",
    "        row positions in that bin; `maxes` is a Series with the largest\n",
    "        `tokensize` of each bin.\n",
    "    \"\"\"\n",
    "    n_batches = math.ceil(len(data) / batch_size)\n",
    "    # n_batches + 1 evenly spaced edges over [0, len(data)] delimit n_batches bins\n",
    "    bins = np.linspace(0, len(data), n_batches + 1)\n",
    "    groups = data.groupby(np.digitize(data.index, bins))\n",
    "    \n",
    "    # aggregate only the column that is needed instead of every column (some hold Python lists)\n",
    "    groups_maxes = groups[\"tokensize\"].max()\n",
    "    \n",
    "    return groups.indices, groups_maxes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CNN Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyper-parameters and settings shared by the model definition and the training cells.\n",
    "MAX_LEN = 20 # needed to ensure equal size of input: every sentence is padded / truncated to this many tokens\n",
    "location = os.getcwd()  # current working directory (not referenced by the other cells shown in this notebook)\n",
    "EMBEDDING_DIM = 128  # length of one word vector; also the length of the zero padding vectors\n",
    "NUM_LAYERS = 3  # NOTE(review): not referenced by the CNN cells in this notebook - possibly a leftover\n",
    "size_layer = 256  # number of convolution filters per filter size (passed to CNN as out_dimension)\n",
    "learning_rate = 0.0001  # learning rate handed to the Adam optimizer\n",
    "BATCH_SIZE = 100  # target number of examples per batch\n",
    "filter_sizes = [2,3,4,5]  # heights of the convolution filters, i.e. how many consecutive tokens each one spans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CNN:\n",
    "    \"\"\"Text CNN with parallel convolutions of several widths (TensorFlow 1.x graph API).\n",
    "\n",
    "    Each example is a [sequence_length, dimension_input] matrix of word vectors\n",
    "    with a trailing channel axis of size 1. For every entry of `filter_sizes`\n",
    "    one convolution + max-over-time pooling branch is built; the pooled features\n",
    "    are concatenated, passed through dropout and a fully connected layer that\n",
    "    produces the class logits.\n",
    "\n",
    "    Args:\n",
    "        sequence_length: number of token positions per example.\n",
    "        dimension_input: length of one word vector.\n",
    "        dimension_output: number of classes.\n",
    "        learning_rate: learning rate of the Adam optimizer.\n",
    "        filter_sizes: list of filter heights (number of tokens a filter spans).\n",
    "        out_dimension: number of filters per filter size.\n",
    "        keep_prob: default dropout keep probability. The default of 0.1 keeps the\n",
    "            previous hard-coded behaviour (90% of the pooled features are dropped).\n",
    "\n",
    "    Attributes:\n",
    "        X, Y: placeholders for the inputs and the one-hot / binarized labels.\n",
    "        keep_prob: scalar placeholder that falls back to the constructor value\n",
    "            when it is not fed; feed 1.0 to switch dropout off, e.g. while\n",
    "            computing validation or test metrics.\n",
    "        logits: un-normalised class scores.\n",
    "        cost: mean softmax cross-entropy plus an L2 penalty on the trainable variables.\n",
    "        optimizer: Adam training op minimising `cost`.\n",
    "        correct_pred, accuracy: per-example hit indicator and its mean.\n",
    "    \"\"\"\n",
    "    def __init__(self, sequence_length, dimension_input, dimension_output, \n",
    "                 learning_rate, filter_sizes, out_dimension, keep_prob=0.1):\n",
    "        \n",
    "        self.X = tf.placeholder(tf.float32, shape=[None, sequence_length, dimension_input, 1])\n",
    "        self.Y = tf.placeholder(tf.float32, shape=[None, dimension_output])\n",
    "        # Not feeding this placeholder reproduces the old fixed keep probability;\n",
    "        # feeding it lets callers disable dropout for evaluation.\n",
    "        self.keep_prob = tf.placeholder_with_default(tf.constant(keep_prob, dtype=tf.float32), shape=[])\n",
    "        \n",
    "        pooled_outputs = []\n",
    "        \n",
    "        for i in filter_sizes:\n",
    "            # one filter bank per height i, spanning the full embedding width\n",
    "            w = tf.Variable(tf.truncated_normal([i, dimension_input, 1, out_dimension], stddev=0.1))\n",
    "            b = tf.Variable(tf.truncated_normal([out_dimension], stddev = 0.01))\n",
    "            conv = tf.nn.relu(tf.nn.conv2d(self.X, w, strides=[1, 1, 1, 1],padding=\"VALID\") + b) # [N, seq_len-i+1, 1, out_dim]\n",
    "            # max over all positions of the feature map (max-over-time pooling)\n",
    "            pooled = tf.nn.max_pool(conv,ksize=[1, sequence_length - i + 1, 1, 1],strides=[1, 1, 1, 1],padding='VALID') # [N, 1, 1, out_dim]\n",
    "            pooled_outputs.append(pooled)\n",
    "        \n",
    "        h_pool = tf.concat(pooled_outputs, 3) # [N, 1, 1, out_dim*len(filter_sizes)]\n",
    "        h_pool_flat = tf.nn.dropout(tf.reshape(h_pool, [-1, out_dimension * len(filter_sizes)]), self.keep_prob) # [N, out_dim*len(filter_sizes)] \n",
    "        \n",
    "        w = tf.Variable(tf.truncated_normal([out_dimension * len(filter_sizes), dimension_output], stddev=0.1))\n",
    "        b = tf.Variable(tf.truncated_normal([dimension_output], stddev = 0.01))\n",
    "        self.logits = tf.matmul(h_pool_flat, w) + b #FC\n",
    "        \n",
    "        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.Y))\n",
    "        # L2 penalty over every trainable variable currently in the default graph (weights and biases)\n",
    "        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())\n",
    "        self.cost += l2\n",
    "        \n",
    "        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)\n",
    "        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "### define model\n",
    "# Start from an empty graph so that re-running this cell does not pile up duplicate variables.\n",
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = CNN(MAX_LEN, EMBEDDING_DIM, num_emotions, learning_rate, filter_sizes, size_layer)\n",
    "sess.run(tf.global_variables_initializer())\n",
    "dimension = EMBEDDING_DIM\n",
    "saver = tf.train.Saver(tf.global_variables())\n",
    "# EARLY_STOPPING: number of consecutive epochs without a better validation accuracy after which training stops\n",
    "# CURRENT_CHECKPOINT: epochs since the last improvement, CURRENT_ACC: best validation accuracy so far,\n",
    "# EPOCH: number of finished epochs\n",
    "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 10, 0, 0, 0\n",
    "\n",
    "### defining batch generation\n",
    "train_groups_indices, train_groups_maxes = generate_batches(train_data, BATCH_SIZE)\n",
    "test_groups_indices, test_groups_maxes = generate_batches(test_data, BATCH_SIZE)\n",
    "val_groups_indices, val_groups_maxes = generate_batches(val_data, BATCH_SIZE)\n",
    "\n",
    "# Count the batches that generate_batches really produced (it uses ceil). len(data) // BATCH_SIZE is one\n",
    "# short whenever len(data) is not a multiple of BATCH_SIZE, which silently skipped the last batch.\n",
    "n_train = len(train_groups_indices)\n",
    "n_test = len(test_groups_indices)\n",
    "n_val = len(val_groups_indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0 , pass acc: 0 , current acc: 0.38328013924712484\n",
      "time taken: 111.49427223205566\n",
      "epoch: 1 , training loss: 2.7404070660606035 , training acc: 0.31944340576511715 , valid loss: 2.3440612626798227 , valid acc: 0.38328013924712484\n",
      "epoch: 1 , pass acc: 0.38328013924712484 , current acc: 0.47112449089234526\n",
      "time taken: 99.59676480293274\n",
      "epoch: 2 , training loss: 2.1953228446105713 , training acc: 0.43311172963769895 , valid loss: 2.057679238192963 , valid acc: 0.47112449089234526\n",
      "epoch: 2 , pass acc: 0.47112449089234526 , current acc: 0.5456565468374527\n",
      "time taken: 99.25273585319519\n",
      "epoch: 3 , training loss: 1.9006530315262737 , training acc: 0.5105206305553766 , valid loss: 1.759758419159687 , valid acc: 0.5456565468374527\n",
      "epoch: 3 , pass acc: 0.5456565468374527 , current acc: 0.6105387018937053\n",
      "time taken: 98.94453477859497\n",
      "epoch: 4 , training loss: 1.6265683773850332 , training acc: 0.5837434305223115 , valid loss: 1.5164523237582408 , valid acc: 0.6105387018937053\n",
      "epoch: 4 , pass acc: 0.6105387018937053 , current acc: 0.6670102341608568\n",
      "time taken: 99.34092974662781\n",
      "epoch: 5 , training loss: 1.4028360254106111 , training acc: 0.642700408107576 , valid loss: 1.309723995851748 , valid acc: 0.6670102341608568\n",
      "epoch: 5 , pass acc: 0.6670102341608568 , current acc: 0.7125734413663546\n",
      "time taken: 98.88827657699585\n",
      "epoch: 6 , training loss: 1.2251061134460355 , training acc: 0.6904545482839244 , valid loss: 1.156757531744061 , valid acc: 0.7125734413663546\n",
      "epoch: 6 , pass acc: 0.7125734413663546 , current acc: 0.742652263153683\n",
      "time taken: 98.9807059764862\n",
      "epoch: 7 , training loss: 1.0906997094048194 , training acc: 0.7288036489588341 , valid loss: 1.0374373226906315 , valid acc: 0.742652263153683\n",
      "epoch: 7 , pass acc: 0.742652263153683 , current acc: 0.7692129458441879\n",
      "time taken: 98.55075240135193\n",
      "epoch: 8 , training loss: 0.9814496111214754 , training acc: 0.7599724675738264 , valid loss: 0.9418572115175652 , valid acc: 0.7692129458441879\n",
      "epoch: 8 , pass acc: 0.7692129458441879 , current acc: 0.7899992152145414\n",
      "time taken: 99.1434531211853\n",
      "epoch: 9 , training loss: 0.8987082945741448 , training acc: 0.7827681179071478 , valid loss: 0.8706118247725747 , valid acc: 0.7899992152145414\n",
      "epoch: 9 , pass acc: 0.7899992152145414 , current acc: 0.8043816769213388\n",
      "time taken: 99.09578442573547\n",
      "epoch: 10 , training loss: 0.8314490201404342 , training acc: 0.7998282034947148 , valid loss: 0.8113973172325076 , valid acc: 0.8043816769213388\n",
      "epoch: 10 , pass acc: 0.8043816769213388 , current acc: 0.8135984657388745\n",
      "time taken: 99.02260065078735\n",
      "epoch: 11 , training loss: 0.7799709416407085 , training acc: 0.8136443843044189 , valid loss: 0.7646817108898452 , valid acc: 0.8135984657388745\n",
      "epoch: 11 , pass acc: 0.8135984657388745 , current acc: 0.826408773886435\n",
      "time taken: 99.33067464828491\n",
      "epoch: 12 , training loss: 0.7398124464819749 , training acc: 0.8242475780363278 , valid loss: 0.7282893648653319 , valid acc: 0.826408773886435\n",
      "epoch: 12 , pass acc: 0.826408773886435 , current acc: 0.8315396270517147\n",
      "time taken: 99.13869643211365\n",
      "epoch: 13 , training loss: 0.7082534141587963 , training acc: 0.8325606501254917 , valid loss: 0.7001974065646981 , valid acc: 0.8315396270517147\n",
      "epoch: 13 , pass acc: 0.8315396270517147 , current acc: 0.8396525830030441\n",
      "time taken: 99.01190447807312\n",
      "epoch: 14 , training loss: 0.6797877444807583 , training acc: 0.8396299724215187 , valid loss: 0.6740981223005237 , valid acc: 0.8396525830030441\n",
      "epoch: 14 , pass acc: 0.8396525830030441 , current acc: 0.8417420321793267\n",
      "time taken: 99.10474133491516\n",
      "epoch: 15 , training loss: 0.6586716393196769 , training acc: 0.8447727317609024 , valid loss: 0.6590975569278905 , valid acc: 0.8417420321793267\n",
      "epoch: 15 , pass acc: 0.8417420321793267 , current acc: 0.8471238818584066\n",
      "time taken: 99.28924250602722\n",
      "epoch: 16 , training loss: 0.6396419865889664 , training acc: 0.8481141509799696 , valid loss: 0.6371233932899706 , valid acc: 0.8471238818584066\n",
      "epoch: 16 , pass acc: 0.8471238818584066 , current acc: 0.8497803719206289\n",
      "time taken: 99.23088574409485\n",
      "epoch: 17 , training loss: 0.6252516990078394 , training acc: 0.8511497672278414 , valid loss: 0.6266002678735689 , valid acc: 0.8497803719206289\n",
      "epoch: 17 , pass acc: 0.8497803719206289 , current acc: 0.8520936591155601\n",
      "time taken: 98.95444130897522\n",
      "epoch: 18 , training loss: 0.6110860326599589 , training acc: 0.8541404544488337 , valid loss: 0.6147655753249471 , valid acc: 0.8520936591155601\n",
      "time taken: 99.13990068435669\n",
      "epoch: 19 , training loss: 0.601082578943782 , training acc: 0.8561458678281686 , valid loss: 0.6093109747902914 , valid acc: 0.8515625860203396\n",
      "epoch: 19 , pass acc: 0.8520936591155601 , current acc: 0.8534263031500758\n",
      "time taken: 99.37740874290466\n",
      "epoch: 20 , training loss: 0.5920784377136935 , training acc: 0.8580038148833925 , valid loss: 0.5999002879993482 , valid acc: 0.8534263031500758\n",
      "epoch: 20 , pass acc: 0.8534263031500758 , current acc: 0.8549487206972006\n",
      "time taken: 99.04343605041504\n",
      "epoch: 21 , training loss: 0.5850345461727713 , training acc: 0.8602433116274499 , valid loss: 0.5918926601157044 , valid acc: 0.8549487206972006\n",
      "epoch: 21 , pass acc: 0.8549487206972006 , current acc: 0.8571001638968786\n",
      "time taken: 99.31200790405273\n",
      "epoch: 22 , training loss: 0.5761035517712217 , training acc: 0.8614706987173621 , valid loss: 0.586471332632231 , valid acc: 0.8571001638968786\n",
      "epoch: 22 , pass acc: 0.8571001638968786 , current acc: 0.8587385169935949\n",
      "time taken: 99.13654255867004\n",
      "epoch: 23 , training loss: 0.5711308834056954 , training acc: 0.8627316740551926 , valid loss: 0.578551544610298 , valid acc: 0.8587385169935949\n",
      "epoch: 23 , pass acc: 0.8587385169935949 , current acc: 0.8604051886182843\n",
      "time taken: 99.00062704086304\n",
      "epoch: 24 , training loss: 0.5687064992229501 , training acc: 0.8635900920739619 , valid loss: 0.569358881901611 , valid acc: 0.8604051886182843\n",
      "time taken: 98.76369404792786\n",
      "epoch: 25 , training loss: 0.5644412964709502 , training acc: 0.8660450068371717 , valid loss: 0.575629172564456 , valid acc: 0.859682426759691\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-13-ff6a57a9c13e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     13\u001b[0m         \u001b[0mbatch_y\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtrain_groups_indices\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbin_emotions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m         \u001b[0mbatch_x\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpand_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m         \u001b[0;31m#print(batch_x)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/tensorflow/lib/python3.6/site-packages/numpy/lib/shape_base.py\u001b[0m in \u001b[0;36mexpand_dims\u001b[0;34m(a, axis)\u001b[0m\n\u001b[1;32m    312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    313\u001b[0m     \"\"\"\n\u001b[0;32m--> 314\u001b[0;31m     \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    315\u001b[0m     \u001b[0mshape\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    316\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/tensorflow/lib/python3.6/site-packages/numpy/core/numeric.py\u001b[0m in \u001b[0;36masarray\u001b[0;34m(a, dtype, order)\u001b[0m\n\u001b[1;32m    490\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    491\u001b[0m     \"\"\"\n\u001b[0;32m--> 492\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    493\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    494\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "### training\n",
    "# Trains epoch by epoch until the validation accuracy has not improved for EARLY_STOPPING consecutive\n",
    "# epochs; the weights of the best epoch so far are written to checkpoint_path.\n",
    "checkpoint_path = os.getcwd() + \"/model/husein_emonet/model-cnn-vector.ckpt\"\n",
    "# tf.train.Saver.save fails when the parent directory is missing, so create it up front\n",
    "os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)\n",
    "\n",
    "if n_train == 0 or n_val == 0:\n",
    "    # the per-epoch averages below divide by these counts\n",
    "    raise ValueError('need at least one training batch and one validation batch')\n",
    "\n",
    "def build_feed(frame, positions):\n",
    "    \"\"\"Return the feed_dict (padded embeddings and labels) for the rows of `frame` at `positions`.\"\"\"\n",
    "    # groupby(...).indices holds positional indices, hence iloc for inputs and labels alike\n",
    "    rows = frame.iloc[positions]\n",
    "    batch_x = rows.tokens.apply(lambda d: generate_embeds_with_pads(d, MAX_LEN)).values.tolist()\n",
    "    batch_x = np.expand_dims(batch_x, axis=-1)  # append the channel axis expected by model.X\n",
    "    batch_y = rows.bin_emotions.values.tolist()\n",
    "    return {model.X : batch_x, model.Y : batch_y}\n",
    "\n",
    "while True:\n",
    "    lasttime = time.time()\n",
    "    ### early stopping to avoid overfitting\n",
    "    if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
    "        print('break epoch:', EPOCH)\n",
    "        break\n",
    "    train_acc, train_loss, val_acc , val_loss = 0, 0, 0, 0\n",
    "    \n",
    "    for b in range(n_train):\n",
    "        # batch keys produced by generate_batches are 1-based\n",
    "        feed = build_feed(train_data, train_groups_indices[b+1])\n",
    "        # loss and accuracy are fetched in the same run as the update step, so they come from one\n",
    "        # forward pass (same dropout mask) instead of a second evaluation after the weights changed\n",
    "        loss, acc, _ = sess.run([model.cost, model.accuracy, model.optimizer], feed_dict = feed)\n",
    "        train_loss += loss\n",
    "        train_acc += acc\n",
    "        \n",
    "    for b in range(n_val):\n",
    "        feed = build_feed(val_data, val_groups_indices[b+1])\n",
    "        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = feed)\n",
    "        val_loss += loss\n",
    "        val_acc += acc\n",
    "    \n",
    "    # per-batch averages for the epoch\n",
    "    train_loss /= n_train\n",
    "    train_acc /= n_train\n",
    "    val_loss /= n_val\n",
    "    val_acc /= n_val\n",
    "    \n",
    "    if val_acc > CURRENT_ACC:\n",
    "        # new best validation accuracy: remember it, reset the patience counter and checkpoint the weights\n",
    "        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', val_acc)\n",
    "        CURRENT_ACC = val_acc\n",
    "        CURRENT_CHECKPOINT = 0\n",
    "        saver.save(sess, checkpoint_path)\n",
    "    else:\n",
    "        CURRENT_CHECKPOINT += 1\n",
    "    EPOCH += 1\n",
    "    print('time taken:', time.time()-lasttime)\n",
    "    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', val_loss, ', valid acc:', val_acc)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
